{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.grid_search import GridSearchCV\n",
    "\n",
    "df = pd.read_csv('./ad.data', header=None)\n",
    "\n",
    "explanatory_variable_columns = set(df.columns.values)\n",
    "explanatory_variable_columns.remove(len(df.columns.values)-1)\n",
    "response_variable_column = df[len(df.columns.values)-1] # The last column describes the classes\n",
    "\n",
    "y = [1 if e == 'ad.' else 0 for e in response_variable_column]\n",
    "X = df[list(explanatory_variable_columns)].copy()\n",
    "X.replace(to_replace=' *?', value=-1, regex=True, inplace=True)\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y)\n",
    "\n",
    "pipeline = Pipeline([\n",
    "    ('clf', DecisionTreeClassifier(criterion='entropy'))\n",
    "])\n",
    "parameters = {\n",
    "    'clf__max_depth': (150, 155, 160),\n",
    "    'clf__min_samples_split': (2, 3),\n",
    "    'clf__min_samples_leaf': (1, 2, 3)\n",
    "}\n",
    "\n",
    "grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1, scoring='f1')\n",
    "grid_search.fit(X_train, y_train)\n",
    "\n",
    "best_parameters = grid_search.best_estimator_.get_params()\n",
    "print('Best score: %0.3f' % grid_search.best_score_)\n",
    "print('Best parameters set:')\n",
    "for param_name in sorted(parameters.keys()):\n",
    "    print('t%s: %r' % (param_name, best_parameters[param_name]))\n",
    "\n",
    "predictions = grid_search.predict(X_test)\n",
    "print(classification_report(y_test, predictions))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 3 folds for each of 18 candidates, totalling 54 fits\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s\n",
      "[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:    6.6s finished\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best score: 0.887\n",
      "Best parameters set:\n",
      "tclf__max_depth: 150\n",
      "tclf__min_samples_leaf: 1\n",
      "tclf__min_samples_split: 3\n",
      "             precision    recall  f1-score   support\n",
      "\n",
      "          0       0.98      0.99      0.98       717\n",
      "          1       0.92      0.83      0.87       103\n",
      "\n",
      "avg / total       0.97      0.97      0.97       820\n",
      "\n"
     ]
    }
   ],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
